/**********************************************************************/
/*
   Title: clean_susenas_finance_mar18.do
   Author: Clotaire Boyer, Robbie Dulin
   Created: 23 Feb 2021
   Description: Cleans raw SUSENAS data, creates outcome variables for
		 March 2018 SUSENAS, and saves dta files.
*/
/**********************************************************************/

/*----------------------------------------------------*/
               /* Section 0: Setup */
/*----------------------------------------------------*/

version 14.2
clear
set more off

global randomization2             "$dropboxbase/04_ResearchDesign/02 randomization/1705 Kab randomization v6 - 8.2 milion, treat most java - FINAL"

cap log close
local prefix: display %tdCYND td(`c(current_date)')
log using "$log/`prefix'_clean_susenas_finance_mar18", replace text

/*----------------------------------------------------*/
                /* Section: Blok 42 */
/*----------------------------------------------------*/

** financial transaction fee
	u "$importable/blok42_mar_18.dta", clear
	keep if KODE == 268 | KODE == 287
	gen fin_trans_fees = B42K5 if KODE == 268
	//summ fin_trans_fees

** cell phone/smartphone spending (purchase, repairs, accessories)
	gen cell_phone_spend = B42K5 if KODE == 287
	//summ cell_phone_spend

** colapse to HH level
	collapse (sum) fin_trans_fees cell_phone_spend, by(URUT)
	//summ fin_trans_fees if fin_trans_fees > 0
	//summ cell_phone_spend if cell_phone_spend > 0

** save
	tempfile blok42
	save `blok42'


/*----------------------------------------------------*/
/* Section 3: Create outcome variables from individual survey */
/*----------------------------------------------------*/

u "${importable}/kor18ind_revisi_diseminasi.dta", clear


	*  Recovering the numnber of small buisnesses in SUESENA 2018 (704-705 in 2019)

	//tab  R705
	gen smallbui_sus19_type1_ind_mar18 = R805 == 1
	replace smallbui_sus19_type1_ind_mar18 = 0 if R804 <= 6 // removing farmers
	replace smallbui_sus19_type1_ind_mar18 = 0 if R804 == 25
	//summ smallbui_sus19_type1_ind_mar18

	gen smallbui_sus19_type2_ind_mar18 = R805 == 2
	replace smallbui_sus19_type2_ind_mar18 = 0 if R804 <= 6
	replace smallbui_sus19_type2_ind_mar18 = 0 if R804 == 25
	//summ smallbui_sus19_type2_ind_mar18

	gen smallbui_sus19_type3_ind_mar18 = R805 == 3
	replace smallbui_sus19_type3_ind_mar18 = 0 if R804 <= 6
	replace smallbui_sus19_type3_ind_mar18 = 0 if R804 == 25
	//summ smallbui_sus19_type3_ind_mar18

	gen smallbui_sus19_all_ind_mar18 = smallbui_sus19_type1_ind_mar18
	replace smallbui_sus19_all_ind_mar18 = 1 if smallbui_sus19_type2_ind_mar18 == 1
	replace smallbui_sus19_all_ind_mar18 = 1 if smallbui_sus19_type2_ind_mar18 == 1
	//summ smallbui_sus19_all_ind_mar18

* health complaints
//tab  R1002
gen health_complaints_ind = R1002 == 1

* female head of household
//tab  R403 R405 if R403 == 1
gen female_KRT = R403 == 1 & R405 == 2 if R403 == 1
//summ female_KRT

* dummy for whether individual has savings account
//tab  R717
gen savings_ind = R717 == 1 if R717 != .

* own cell phone
//tab  R714
recode R714 (5 = 0), gen(cell_phone)

* use internet
//tab  R716
recode R716 (5 = 0), gen(internet_use)

* Head of household education levels
//tab  R613
// School classification descriptions from: https://mikrodata.bps.go.id/mikrodata/index.php/catalog/814/datafile/F1#page=F4&tab=data-dictionary
* Elementary school
// dummy if hh member has elementary edu or equivalent: (paket A, SDLB, SD, MI)
gen elementary = inlist(R613, 1, 2, 3, 4) if R403 == 1

* Junior high school
// dummy if hh member has junior high edu or equivalent: (paket B, SMPLB, SMP, MTs)
gen junior_high = inlist(R613, 5, 6, 7, 8) if R403 == 1

* High school
// dummy if hh member has high school edu or equivalent, or vocational school: (paket C, SMLB, SMA, MA, SMK, MAK)
gen high_school = inlist(R613, 9, 10, 11, 12, 13, 14) if R403 == 1

* Tertiary edu
// dummy if hh member has tertiary edu: (D1/D2, D3, D4, S1, S2, S3)
gen tertiary = inlist(R613, 15, 16, 17, 18, 19, 20) if R403 == 1
//summ elementary junior_high high_school tertiary

* children
//summ R407
gen child = R407 < 18

** save individual dataset
preserve
keep R101 R102 R105 savings_ind cell_phone internet_use ///
FWT smallbui_sus19_all_ind_mar18 smallbui_sus19_type1_ind_mar18 ///
smallbui_sus19_type2_ind_mar18 smallbui_sus19_type3_ind_mar18
collapse (mean) smallbui_sus19_all_ind_mar18 smallbui_sus19_type1_ind_mar18 ///
smallbui_sus19_type2_ind_mar18 smallbui_sus19_type3_ind_mar18 savings_ind_mar18 = savings_ind ///
cell_phone_mar18 = cell_phone internet_use_mar18 = internet_use [aw = FWT], by(R101 R102 R105)
//summ
save "$cleaned/finance/susenas_mar18_finance_ind.dta", replace
restore

* collapse number by household
collapse (sum) smallbui_sus19_all_ind_mar18 smallbui_sus19_type1_ind_mar18 ///
smallbui_sus19_type2_ind_mar18 smallbui_sus19_type3_ind_mar18 ///
savings_hh = savings_ind internet_use cell_phone female_KRT elementary ///
junior_high high_school ///
tertiary health_complaints = health_complaints_ind children = child, by(URUT)

* dummy for hh internet use
di _N
//summ internet_use
gen internet_use_hh = internet_use > 0
drop internet_use
rename internet_use_hh internet_use
//tab  internet_use

* dummy for hh cell phone ownership
//summ cell_phone
gen cell_phone_hh = cell_phone > 0
drop cell_phone
rename cell_phone_hh cell_phone
//tab  cell_phone

* dummy for any health complaints
gen health_complaint_hh = health_complaints > 0 & health_complaints != .
//summ health_complaint_hh
drop health_complaints

* dummy for any savings account in household
gen any_savings = savings_hh > 0
la var any_savings				"HH member has savings account"
drop savings_hh
//summ

	* Small buism variables = 1 if at least one hh member is part of such business

	gen smallbui_sus19_all = (smallbui_sus19_all_ind_mar18 >= 1) if smallbui_sus19_all_ind_mar18<.
	drop smallbui_sus19_all_ind_mar18

	gen smallbui_sus19_type1 = (smallbui_sus19_type1_ind_mar18 >= 1) if smallbui_sus19_type1_ind_mar18<.
	drop smallbui_sus19_type1_ind_mar18

	gen smallbui_sus19_type2 = (smallbui_sus19_type2_ind_mar18 >= 1) if smallbui_sus19_type2_ind_mar18<.
	drop smallbui_sus19_type2_ind_mar18

	gen smallbui_sus19_type3 = (smallbui_sus19_type3_ind_mar18 >= 1) if smallbui_sus19_type3_ind_mar18<.
	drop smallbui_sus19_type3_ind_mar18

tempfile kor18ind
qui save `kor18ind'

/*----------------------------------------------------*/
/* Section 4: Create variables from household survey */
/*----------------------------------------------------*/

// load household survey
u "${importable}/kor18rt_diseminasi.dta", clear

* merge in individual data
merge  1:1  URUT using "`kor18ind'"
drop _m

* merge with blok 42 data
merge 1:1 URUT using `blok42'
drop _m


* Merge in kabupaten names (treatment and control districts identified by kabu name, so need name to merge)
preserve
import excel "${importable}/province_names.xlsx", firstrow clear
rename KodeProvinsi R101
rename KodeKabupaten R102
destring R101, replace
destring R102, replace
rename NamaKabupaten namakabupaten
rename NamaProvinsi Provinsi
tempfile kabu_names
save `kabu_names'
restore

merge  m:1 R101 R102 using "`kabu_names'"
assert _m != 1
drop _m

* dummy for HH receive BPNT
gen bpnt_ever = R1604 == 1

* dummy for HH receive Raskin
gen raskin_any = R1601 == 1

* dummy for receive any subsidy
gen receive_subsidy = bpnt_ever == 1 | raskin_any == 1
//summ receive_subsidy

* log per capita consumption (monthly)
//summ EXP_CAP
gen log_cons = ln(EXP_CAP)

* per capita consumption, in levels (monthly)
gen cons = EXP_CAP

* KKS card
//tab  R1608
gen kks = R1608 == 1 | R1608 == 2
//tab  kks

* PKH recipient
//tab  R1609
recode R1609 (5 = 0), gen(pkh_receipt)
//tab  pkh_receipt

* where to receive PKH benefits
//tab  R1610C
gen pkh_post = R1610C == 1 if pkh_receipt == 1
gen pkh_atm = R1610C == 2 if pkh_receipt == 1
gen pkh_bank = R1610C == 3 if pkh_receipt == 1
gen pkh_agent = R1610C == 4 if pkh_receipt == 1
gen pkh_other = R1610C == 5 if pkh_receipt == 1
//summ pkh*


	gen phk_other_location=pkh_bank
	replace phk_other_location=1 if pkh_post==1
	replace phk_other_location=1 if pkh_atm==1
	replace phk_other_location=1 if pkh_other==1


* any financial transaction fees
//summ fin_trans_fees
gen any_fin_trans_fee = fin_trans_fees > 0 & fin_trans_fees != .
drop fin_trans_fees
rename any_fin_trans_fee fin_trans_fees
//tab  fin_trans_fees

* any cell phone spending
//summ cell_phone_spend
gen cell_phone_buy = cell_phone_spend > 0 & cell_phone_spend != .
//tab  cell_phone_buy
drop cell_phone_spend

* receive credit from any source
//tab  R1701A
gen credit = R1701A == 1 | R1701B == 1 | R1701C == 1 | R1701D == 1 | R1701E == 1 | ///
	R1701F == 1 | R1701G == 1 | R1701H == 1 | R1701I == 1 | R1701J == 1
//tab  credit

* receive credit from KUR
gen credit_KUR = R1701A == 1
//tab  credit_KUR

* receive credit from commercial banks
gen credit_bank = R1701B == 1
//tab  credit_bank

* receive credit from rural credit banks (BPR)
gen credit_BPR = R1701C == 1
//tab  credit_BPR

* receive credit from coop
gen credit_coop = R1701D == 1
//tab  credit_coop

* receive individual loans with interest
gen credit_loan = R1701E == 1
//tab  credit_loan

* receive credit from pawnshops
gen credit_pawn = R1701F == 1
//tab  credit_pawn

* receive credit from leasing companies
gen credit_lease = R1701G == 1
//tab  credit_lease

* receive credit from KUBE
gen credit_KUBE = R1701H == 1
//tab  credit_KUBE

* receive credit from village owned enterprises
gen credit_BUMDES = R1701I == 1
//tab  credit_BUMDES

* receve credit from other source
gen credit_other = R1701J == 1
//tab  credit_other

* receive social assistance from local govt
//tab  R1611
gen local_gov_aid = R1611 == 1

* amount of social assistance from local govt received
//summ R1612A_* R1612B_I
egen local_aid_rp = rowtotal(R1612A_* R1612B_I)
// bysort local_gov_aid: summ local_aid_rp

* num HH members
gen num_in_hh = R301



/*----------------------------------------------------*/
                /* Section: Outliers */
/*----------------------------------------------------*/

local continuous_vars "local_aid_rp"

** windsorize to 0.5 and 99.5 percentiles of continuous variables
	foreach var of varlist `continuous_vars' {
		_pctile `var', p(0.5 99.5)
		qui gen `var'W = `var'
		replace `var'W = `r(r1)' if `var' < `r(r1)'
		replace `var'W = `r(r2)' if `var' > `r(r2)' & `var' != .
		summ `var' `var'W
	}

** Remove outliers with z-scores of 12 and above
	* Tag outliers with z-scores 12 and above
	foreach var of varlist `continuous_vars' {
	  qui summ `var'
	  qui gen `var'_z = abs((`var' - `r(mean)') / `r(sd)')
	  // extremes `var'_z `var', n(20) high
	  quietly count if `var'_z >= 12 & `var'_z != .
	  di "`var' has `r(N)' values greater than 12 sd from the mean"
	  qui gen `var'_tag = 1 if `var'_z >= 12 & `var'_z != .
	}

	* set tagged values to missing
	// for now: strictly set all 12> to missing
	foreach var of varlist `continuous_vars' {
		quietly count if `var'_tag == 1

		// if tagged vals in var
		if `r(N)' > 0 {
			// copy original variable
			quietly gen `var'_orig = `var'

			// set tagged values to missing
		  quietly replace `var' = . if `var'_tag == 1
			local num_miss = `r(N)'
			di "`num_miss' values in `var' set to missing"
			extremes `var'_z `var'_orig `var', n(`num_miss')  high
		}
	}

	drop *z *tag
	qui ds *orig
	di "Original versions of vars with outliers removed: `r(varlist)'"

	* label new variables
	foreach var of varlist *W {
		local varname = substr("`var'", 1, length("`var'") - 1)
		local label : var label `varname'
		la var `var' "`label' (Winsorized)"
	}

	foreach var of varlist *_orig {
		local varname = substr("`var'", 1, length("`var'") - 5)
		local label : var label `varname'
		la var `var' "`label' (Raw)"
		local labelnew : var label `var'
	}


/*----------------------------------------------------*/
      /* Section: Merge treatment assignments */
/*----------------------------------------------------*/

* Merge in treatment and control status variables
// Load (real) treatment and control status dataset
preserve
u "$randomization2/treatments.dta", clear
keep namaprovinsi namakabupaten treated finalstratum
rename namaprovinsi Provinsi
keep if treated != .
// Need to create kota/kab variable to distinguish bewteen districts with same name in same province
/* I identify which districts are experimental using ⁨05_Intervention⁩/01 Impact Evaluation⁩/20190523_Kab Kota Phase Expansion.xlsx,
		which distinguishes between kota and kab for same-name districts. If a district is experimental in the above sheet, then I
		assume that it corresponds to the district with the same name in the randomization2 file. I then assign kab status to these
		districts to match the phase sheet. This distinguishes between districts in the same province with the same name. */
gen kab = 1 if inlist(namakabupaten, "BIMA", "GORONTALO", "PEKALONGAN", "SERANG")

// rename Kabupaten Pontianak to Mempawah (name change in 2014)
// Note: Kota Pontianak is A/B district, kabupaten Mempawah/Pontianak is IE district
replace namakabupaten = "MEMPAWAH" if namakabupaten == "PONTIANAK"

tempfile treatments
save `treatments'
restore

// abbreviate province names to match treatment assignment file
replace Provinsi = "KEP. BANGKA BELITUNG" if Provinsi == "KEPULAUAN BANGKA BELITUNG"
replace Provinsi = "D I YOGYAKARTA" if Provinsi == "DI YOGYAKARTA"
replace Provinsi = "NTB" if Provinsi == "NUSA TENGGARA BARAT"

// create kab variable to match
gen kab = 1 if inlist(KABU, 5206, 7502, 3326, 3604)

// rename KABU_NAME namakabupaten
merge  m:1 Provinsi namakabupaten kab using "`treatments'"
assert _m != 2
drop _m

// dummy for experimental districts
gen experimental = treated != .

// assert double named districts properly assigned
foreach dist of numlist 5272 3279 7571 3375 3673 {
	assert experimental == 0 if KABU == `dist'
}
foreach dist of numlist 6303 5206 7502 3326 3604 {
	assert experimental == 1 if KABU == `dist'
}

// save version with full sample
drop R14* R15* R16* R17* R18* R19* CATATAN NUINFORT*
//summ
save "$cleaned/finance/susenas_mar18_finance_hh_full.dta", replace
cap log close
